--- src/whisper.cpp	2025-10-31 16:34:53
+++ ../non_submodule_llamafile/whisper.cpp/whisper.cpp	2025-10-31 16:53:51
@@ -1,37 +1,26 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+#include "llama.cpp/ggml-vector.h"
+#include "llama.cpp/cores.h"
 #include "whisper.h"
 
-#ifdef WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif
+#define GGML_USE_CUDA
+#define GGML_USE_METAL
 
 #ifdef GGML_USE_METAL
-#include "ggml-metal.h"
+#include "llama.cpp/ggml-metal.h"
 #endif
 
 #ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#include "whisper-mel-cuda.hpp"
+#include "llama.cpp/ggml-cuda.h"
+#include "whisper.cpp/whisper-mel-cuda.hpp"
 #endif
 
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
+#include "llama.cpp/ggml.h"
+#include "llama.cpp/ggml-alloc.h"
+#include "llama.cpp/ggml-backend.h"
 
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_BLAS
-#include "ggml-blas.h"
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-#include "openvino/whisper-openvino-encoder.h"
-#endif
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
+#include "llamafile/llamafile.h"
 
 #include "whisper-mel.hpp"
 
@@ -226,7 +215,8 @@
 // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
 // general-purpose kernels
 //
-static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
+static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y) {
+    int pad = 32;
     // use padding only if dimension 0 is at least 8 times larger than the padding
     // else we won't get much benefit from the optimization
     const int n_pad_req = 8;
@@ -249,7 +239,7 @@
 // TODO: check if other platforms can benefit from this optimization
 // TODO: CUDA is currently broken - seems ggml_mul_mat does not handle views correctly
 #if defined(GGML_USE_METAL)
-#define ggml_mul_mat ggml_mul_mat_pad
+#define ggml_mul_mat (llamafile_has_metal() ? ggml_mul_mat_pad : ggml_mul_mat) // chosen at runtime; the inner ggml_mul_mat is not re-expanded (a macro never expands its own name), so it names the real function
 #endif
 
 // available whisper models
@@ -1085,18 +1075,18 @@
 }
 
 static uint32_t whisper_kv_cache_get_padding(const struct whisper_context & wctx) {
-    if (!wctx.params.flash_attn || !wctx.params.use_gpu) {
+    if (!wctx.params.flash_attn) {
         return 1u;
     }
 
 #ifdef GGML_USE_METAL
-    if (wctx.params.use_gpu) {
+    if (llamafile_has_metal()) {
         return 32u;
     }
 #endif
 
 #ifdef GGML_USE_CUDA
-    if (wctx.params.use_gpu) {
+    if (llamafile_has_cuda()) {
         return 256u;
     }
 #endif
@@ -1239,7 +1229,7 @@
     ggml_backend_t result = NULL;
 
 #ifdef GGML_USE_CUDA
-    if (params.use_gpu) {
+    if (llamafile_has_cuda()) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
         result = ggml_backend_cuda_init(params.gpu_device);
         if (!result) {
@@ -1249,7 +1239,7 @@
 #endif
 
 #ifdef GGML_USE_METAL
-    if (params.use_gpu) {
+    if (!result && llamafile_has_metal()) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
         ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         result = ggml_backend_metal_init();
@@ -1317,14 +1307,14 @@
 static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
     ggml_backend_buffer_type_t result = nullptr;
 
-    params.use_gpu || (result = ggml_backend_cpu_buffer_type());
-
 #ifdef GGML_USE_CUDA
-    result || (result = ggml_backend_cuda_buffer_type(params.gpu_device));
+    if (!result && llamafile_has_cuda())
+        result = ggml_backend_cuda_buffer_type(params.gpu_device);
 #endif
 
 #ifdef GGML_USE_METAL
-    result || (result = ggml_backend_metal_buffer_type());
+    if (!result && llamafile_has_metal())
+        result = ggml_backend_metal_buffer_type();
 #endif
 
 #ifdef GGML_USE_SYCL
@@ -1335,7 +1325,8 @@
     result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
 #endif
 
-    result || (result = ggml_backend_cpu_buffer_type());
+    if (!result)
+        result = ggml_backend_cpu_buffer_type();
 
     return result;
 }
@@ -2784,7 +2775,7 @@
         whisper_context & wctx,
           whisper_state & wstate,
     const whisper_batch & batch,
-              const int   n_threads,
+                    int   n_threads,
                    bool   save_alignment_heads_QKs,
     ggml_abort_callback   abort_callback,
                    void * abort_callback_data) {
@@ -2875,6 +2866,11 @@
         }
 
         logits = gf->nodes[gf->n_nodes - 1];
+
+        if (batch.n_tokens < 16) {
+            if (n_threads > 23)
+                n_threads = 23;
+        }
 
         if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
             return false;
@@ -3603,7 +3599,6 @@
 
 struct whisper_context_params whisper_context_default_params() {
     struct whisper_context_params result = {
-        /*.use_gpu              =*/ true,
         /*.flash_attn           =*/ false,
         /*.gpu_device           =*/ 0,
 
@@ -3708,7 +3703,8 @@
         params.dtw_token_timestamps = false;
     }
 
-    WHISPER_LOG_INFO("%s: use gpu    = %d\n", __func__, params.use_gpu);
+    WHISPER_LOG_INFO("%s: cuda gpu   = %d\n", __func__, llamafile_has_cuda());
+    WHISPER_LOG_INFO("%s: metal gpu  = %d\n", __func__, llamafile_has_metal());
     WHISPER_LOG_INFO("%s: flash attn = %d\n", __func__, params.flash_attn);
     WHISPER_LOG_INFO("%s: gpu_device = %d\n", __func__, params.gpu_device);
     WHISPER_LOG_INFO("%s: dtw        = %d\n", __func__, params.dtw_token_timestamps);
@@ -4782,7 +4778,7 @@
     struct whisper_full_params result = {
         /*.strategy          =*/ strategy,
 
-        /*.n_threads         =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
+        /*.n_threads         =*/ std::min(8, (int32_t) cpu_get_num_math()),
         /*.n_max_text_ctx    =*/ 16384,
         /*.offset_ms         =*/ 0,
         /*.duration_ms       =*/ 0,
@@ -5130,12 +5126,7 @@
         // populate the logprobs array (log_softmax)
         {
             const float logit_max = *std::max_element(logits.begin(), logits.end());
-            float logsumexp = 0.0f;
-            for (int i = 0; i < n_logits; ++i) {
-                if (logits[i] > -INFINITY) {
-                    logsumexp += expf(logits[i] - logit_max);
-                }
-            }
+            float logsumexp = ggml_vec_soft_max_f32(n_logits, 0, logits.data(), logit_max); // [jart]
             logsumexp = logf(logsumexp) + logit_max;
 
             for (int i = 0; i < n_logits; ++i) {
@@ -5155,14 +5146,14 @@
             {
                 float logsumexp = 0.0f;
                 const float logprob_max = *std::max_element(logprobs.begin() + vocab.token_beg, logprobs.end());
-                for (int i = vocab.token_beg; i < n_logits; ++i) {
-                    if (logprobs[i] > -INFINITY) {
-                        logsumexp += expf(logprobs[i] - logprob_max);
+                if (logprob_max > -INFINITY) { // nothing to sum when every entry from token_beg onward is -INFINITY
+                    logsumexp = ggml_vec_soft_max_f32(n_logits - vocab.token_beg, 0, // stores into the logsumexp declared at the top of this scope
+                                                      &logprobs[vocab.token_beg],
+                                                      logprob_max); // [jart]
+                    if (logsumexp > 0.0f) {
+                        timestamp_logprob = logf(logsumexp) + logprob_max;
                     }
                 }
-                if (logsumexp > 0.0f) {
-                    timestamp_logprob = logf(logsumexp) + logprob_max;
-                }
             }
 
             const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
@@ -5181,12 +5172,7 @@
                     // populate the logprobs array (log_softmax)
                     {
                         const float logit_max = *std::max_element(logits.begin(), logits.end());
-                        float logsumexp = 0.0f;
-                        for (int i = 0; i < n_logits; ++i) {
-                            if (logits[i] > -INFINITY) {
-                                logsumexp += expf(logits[i] - logit_max);
-                            }
-                        }
+                        float logsumexp = ggml_vec_soft_max_f32(n_logits, 0, logits.data(), logit_max); // [jart]
                         logsumexp = logf(logsumexp) + logit_max;
 
                         for (int i = 0; i < n_logits; ++i) {
@@ -5203,15 +5189,7 @@
     }
 
     // compute probs
-    {
-        for (int i = 0; i < n_logits; ++i) {
-            if (logits[i] == -INFINITY) {
-                probs[i] = 0.0f;
-            } else {
-                probs[i] = expf(logprobs[i]);
-            }
-        }
-    }
+    ggml_vec_soft_max_f32(n_logits, &probs[0], logprobs.data(), 0); // [jart]
 
 #if 0
     // print first 100 logits - token string : logit
@@ -7462,6 +7440,8 @@
 static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
+    if (FLAG_log_disable) // flag is declared outside this view; when it is set the message is dropped before anything is written to stderr
+        return;
     fputs(text, stderr);
     fflush(stderr);
 }
